Data preparation

Uni / Bivariate visualisations

Load data, calculate variables required, and calculate a dataset which allows you to calculate trend lines over time. That is, I infer which individuals were in the workforce in any one year based on their hire date and tenure.

# Loading and calculating data
data <- read.csv('GCR.csv') %>%
  dplyr::mutate(gender = factor(gender, levels = c("M", "F")),
                gradeAtHire = as.ordered(gradeAtHire),
         age = ageAtHire + tenure,
         gend_dept = paste(gender, department, sep = "_"),
         leaveYear = hireYear + tenure,
         term = ifelse(reason == "terminated", 1, 0))
# str(data)
summary(data)
##        id                department     ageAtHire       hireYear   
##  GCR1   :   1   Administrative:2534   Min.   :18.0   Min.   :1980  
##  GCR10  :   1   Operational   :2466   1st Qu.:27.0   1st Qu.:1990  
##  GCR100 :   1                         Median :36.0   Median :1999  
##  GCR1000:   1                         Mean   :36.1   Mean   :1999  
##  GCR1001:   1                         3rd Qu.:45.0   3rd Qu.:2009  
##  GCR1002:   1                         Max.   :59.0   Max.   :2018  
##  (Other):4994                                                      
##      tenure              reason     gender   gradeAtHire  current       
##  Min.   : 0.000   resigned  :3221   M:2810   1:1764      Mode :logical  
##  1st Qu.: 1.000   retired   : 269   F:2190   2:2002      FALSE:3952     
##  Median : 3.000   terminated: 462            3: 775      TRUE :1048     
##  Mean   : 5.759   NA's      :1048            4: 394                     
##  3rd Qu.: 8.000                              5:  65                     
##  Max.   :38.000                                                         
##                                                                         
##       age         gend_dept           leaveYear         term       
##  Min.   :18.00   Length:5000        Min.   :1980   Min.   :0.0000  
##  1st Qu.:32.00   Class :character   1st Qu.:1995   1st Qu.:0.0000  
##  Median :41.00   Mode  :character   Median :2006   Median :0.0000  
##  Mean   :41.85                      Mean   :2005   Mean   :0.1169  
##  3rd Qu.:51.00                      3rd Qu.:2017   3rd Qu.:0.0000  
##  Max.   :95.00                      Max.   :2018   Max.   :1.0000  
##                                                    NA's   :1048
# Workforce over time: ----
all_years <- expand.grid(year = unique(c(data$hireYear, data$leaveYear)),
                         id = unique(data$id))
data_timeline <- dplyr::full_join(data, all_years, by = "id") %>%
  dplyr::filter(hireYear <= year & leaveYear >= year) %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(reason = ifelse(year != max(year), NA, as.character(reason)),
                current = ifelse(year != max(year), TRUE, FALSE),
                term = ifelse(year != max(year), NA, as.numeric(term)),
                new = hireYear == year)

# Proportion of women in the workforce any year: ----
workforce_prop_timeline <- data_timeline %>%
  dplyr::group_by(year, gender) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(prop_workforce = n / sum(n)) %>%
  dplyr::filter(gender == "F") %>%
  dplyr::select(year, prop_workforce)


# People who have left: ----
data_left <- data[!data$current, ]

We assume that the business was founded in 1980.

Data exploration

I explored univariate and bivariate distributions of variables I hypothesise may be related to each other and to gender using some basic visualisations. If you are interested, click the buttons below.

data_current <- data[data$current, ]
summary(data_current)
##        id                department    ageAtHire        hireYear   
##  GCR1   :   1   Administrative:409   Min.   :18.00   Min.   :1980  
##  GCR1002:   1   Operational   :639   1st Qu.:27.00   1st Qu.:2003  
##  GCR1004:   1                        Median :36.00   Median :2010  
##  GCR1017:   1                        Mean   :35.73   Mean   :2008  
##  GCR1019:   1                        3rd Qu.:44.00   3rd Qu.:2015  
##  GCR102 :   1                        Max.   :59.00   Max.   :2018  
##  (Other):1042                                                      
##      tenure             reason     gender  gradeAtHire current       
##  Min.   : 0.00   resigned  :   0   M:679   1:396       Mode:logical  
##  1st Qu.: 3.00   retired   :   0   F:369   2:424       TRUE:1048     
##  Median : 8.00   terminated:   0           3:150                     
##  Mean   :10.32   NA's      :1048           4: 71                     
##  3rd Qu.:15.25                             5:  7                     
##  Max.   :38.00                                                       
##                                                                      
##       age         gend_dept           leaveYear         term     
##  Min.   :18.00   Length:1048        Min.   :2018   Min.   : NA   
##  1st Qu.:36.00   Class :character   1st Qu.:2018   1st Qu.: NA   
##  Median :46.00   Mode  :character   Median :2018   Median : NA   
##  Mean   :46.05                      Mean   :2018   Mean   :NaN   
##  3rd Qu.:55.00                      3rd Qu.:2018   3rd Qu.: NA   
##  Max.   :95.00                      Max.   :2018   Max.   : NA   
##                                                    NA's   :1048
# Age by gender
ggplot(data_current, aes(x = gender, y = age))+
  geom_violin()+
  BIT_theme

# Tenure by gender
ggplot(data_current, aes(x = gender, y = tenure))+
  geom_violin()+
  BIT_theme

# Department by gender
ggplot(data_current, aes(x = gender, y = department))+
  geom_bin2d()+
  BIT_theme

# Grade at hire by gender
ggplot(data_current, aes(x = gender, y = gradeAtHire))+
  geom_bin2d()+
  BIT_theme

# Tenure by department
ggplot(data_current, aes(x = department, y = tenure))+
  geom_boxplot()+
  BIT_theme

# Age by department
ggplot(data_current, aes(x = department, y = age))+
  geom_violin()+
  BIT_theme

# Age by gender and department
ggplot(data_current, aes(x = gend_dept, y = tenure))+
  geom_violin()+
  BIT_theme

# Mainly the administrative dept has much less experienced women than men, much more similar in operations
ggplot(data_current, aes(x = gend_dept, y = age))+
  geom_violin()+
  BIT_theme

ggplot(data_current, aes(x = gend_dept, y = ageAtHire))+
  geom_violin()+
  BIT_theme

# Were there peak leaving years?
hist(data_left$leaveYear, breaks = length(unique(data_left$leaveYear)))

# There were some spikes: 1986, 2002, 2008/9
# Were women overrepresented in those years?
data_years <- data_left %>%
  dplyr::group_by(leaveYear, gender) %>%
  dplyr::summarise(n = n()) %>%
  tidyr::spread(gender, n) %>%
  dplyr::mutate(F_prop = `F` / (`F` + `M`))
ggplot(data_years, aes(x = leaveYear, y = F_prop))+
  geom_bar(stat = "identity")

# Not noticeably more women leaving in those years as a proportion of everyone leaving that year

# Age by gender
ggplot(data_left, aes(x = gender, y = age))+
  geom_violin()+
  BIT_theme

# Tenure by gender
ggplot(data_left, aes(x = gender, y = tenure))+
  geom_violin()+
  BIT_theme

# Department by gender
ggplot(data_left, aes(x = gender, y = department))+
  geom_bin2d()+
  BIT_theme

# Grade at hire by gender
ggplot(data_left, aes(x = gender, y = gradeAtHire))+
  geom_bin2d()+
  BIT_theme

# Tenure by department
ggplot(data_left, aes(x = department, y = tenure))+
  geom_boxplot()+
  BIT_theme

# Age by department
ggplot(data_left, aes(x = department, y = age))+
  geom_violin()+
  BIT_theme

# Age by gender and department
ggplot(data_left, aes(x = gend_dept, y = tenure))+
  geom_violin()+
  BIT_theme

# Mainly the administrative dept has much less experienced women than men, much more similar in operations
ggplot(data_left, aes(x = gend_dept, y = age))+
  geom_violin()+
  BIT_theme

ggplot(data_left, aes(x = gend_dept, y = ageAtHire))+
  geom_violin()+
  BIT_theme

Workforce over time

Calculate the proportion of women in each section of the workforce (department) in a given year, and compare that to the overall proportion of women in the whole workforce.

data_dept_timeline <- data_timeline  %>%
  dplyr::group_by(year, department, gender) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(prop = n / sum(n)) %>%
  dplyr::filter(gender == "F") %>%
  left_join(., workforce_prop_timeline, by = "year")
ggplot(data_dept_timeline, aes(x = year, y = prop, colour = department))+
  geom_line()+
  geom_smooth(alpha = 0.25)+
  geom_line(aes(x = year, y = prop_workforce), 
              colour = "red")+
  labs(title = "Proportion of female workforce over time",
       x = "Year",
       y = "Proportion of workers that were female",
       colour = "Department",
       subtitle = "GCR overall proportion in shown in red")+
  BIT_theme+
  scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Reasons for leaving

Starting from some basic density plots, I can calculate the proportion of leavers who are women for each separate leaving reason. This tells me whether there are specific reasons that women leave for more often than the general population. Because I saw above that there is a significant difference between the numbers of women between each department, I calculate the reasons separately for each department.

# Reasons for leaving by gender and department
ggplot(data_left, aes(x = gend_dept, y = reason))+
  geom_bin2d()+
  BIT_theme+
  scale_fill_gradientn(colors=strong_palette[1:2])

workforce_dept_prop_timeline <- data_timeline %>%
  dplyr::group_by(year, department, gender) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(prop_workforce = n / sum(n)) %>%
  dplyr::filter(gender == "F") %>%
  dplyr::select(leaveYear = year, department, prop_workforce)

all_reasons <- expand.grid(leaveYear = unique(data_left$leaveYear),
                           department = unique(data_left$department),
                           reason = unique(data_left$reason),
                           gender = unique(data_left$gender))
data_reasons <- data_left %>%
  dplyr::group_by(leaveYear, department, reason, gender) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::ungroup() %>%
  dplyr::full_join(., all_reasons, 
                   by = c("leaveYear", "department", "reason", "gender")) %>%
  dplyr::mutate(n = ifelse(is.na(n), 0, n)) %>%
  dplyr::group_by(leaveYear, department, reason) %>%
  dplyr::mutate(tot = sum(n)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(gender == "F") %>%
  dplyr::mutate(prop = ifelse(tot > 0,
                              n / tot, 0)) %>%
  dplyr::left_join(., workforce_dept_prop_timeline, 
                   by = c("leaveYear", "department")) %>%
  dplyr::arrange(leaveYear, department, reason, gender)


ggplot(data_reasons, aes(x = leaveYear, y = prop, colour = reason))+
  geom_line(alpha = 0.5)+
  geom_smooth(se = FALSE)+
  geom_line(aes(x = leaveYear, y = prop_workforce), 
            colour = "red")+
  facet_wrap(department~reason)+
  labs(title = "Women as a % of all who leave for each reason",
       x = "Year of leaving",
       y = "Proportion of women",
       subtitle = "Department's overall proportion of women shown in red")+
  BIT_theme+
  scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Significant association between gender and being terminated?
reason_assoc <- chisq.test(table(data$gender, data$term))

There appears to be a strong association between gender and being terminated, at a Chi squared of 30.52, p < .001.

Data analysis

I want to dig deeper into this association between gender and being terminated. First, I created a set of mosaic plots to explore the bivariate association between termination and other variables. Click the button below to see those.

library("graphics")
data_dept <- as.table(as.matrix(table(data_left$department, data_left$term)))
mosaicplot(data_dept, shade = TRUE, las=2, main = "Department")

data_ageH <- as.table(as.matrix(table(data_left$ageAtHire, data_left$term)))
mosaicplot(data_ageH, shade = TRUE, las=2, main = "ageAtHire")

data_ageL <- as.table(as.matrix(table(data_left$age, data_left$term)))
mosaicplot(data_ageL, shade = TRUE, las=2, main = "ageWhenLeft")

data_year <- as.table(as.matrix(table(data_left$hireYear, data_left$term)))
mosaicplot(data_year, shade = TRUE, las=2, main = "hireYear")

data_leave <- as.table(as.matrix(table(data_left$leaveYear, data_left$term)))
mosaicplot(data_leave, shade = TRUE, las=2, main = "leaveYear")

# data_ten <- as.table(as.matrix(data_left$tenure, data_left$term))
# mosaicplot(data_ten, shade = TRUE, las=2, main = "Tenure")
data_gend <- as.table(as.matrix(table(data_left$gender, data_left$term)))
mosaicplot(data_gend, shade = TRUE, las=2, main = "Gender")

data_grade <- as.table(as.matrix(table(data_left$gradeAtHire, data_left$term)))
mosaicplot(data_grade, shade = TRUE, las=2, main = "GradeAtHire")

I then formalised these analyses with an ANOVA analysis on termination.

summary(aov(term ~ gradeAtHire * department * gender * ageAtHire * leaveYear, data = data_left))
##                                                     Df Sum Sq Mean Sq
## gradeAtHire                                          4    0.3  0.0866
## department                                           1    0.9  0.8950
## gender                                               1    2.3  2.3373
## ageAtHire                                            1    0.0  0.0363
## leaveYear                                            1    0.0  0.0004
## gradeAtHire:department                               3    0.1  0.0430
## gradeAtHire:gender                                   4    0.2  0.0603
## department:gender                                    1    0.0  0.0108
## gradeAtHire:ageAtHire                                4    0.8  0.2000
## department:ageAtHire                                 1    0.2  0.1544
## gender:ageAtHire                                     1    0.0  0.0172
## gradeAtHire:leaveYear                                4    0.1  0.0138
## department:leaveYear                                 1    0.1  0.0532
## gender:leaveYear                                     1    0.0  0.0262
## ageAtHire:leaveYear                                  1    0.0  0.0147
## gradeAtHire:department:gender                        3    0.3  0.0952
## gradeAtHire:department:ageAtHire                     3    0.3  0.1069
## gradeAtHire:gender:ageAtHire                         4    0.1  0.0313
## department:gender:ageAtHire                          1    0.1  0.1044
## gradeAtHire:department:leaveYear                     3    0.0  0.0057
## gradeAtHire:gender:leaveYear                         4    1.7  0.4151
## department:gender:leaveYear                          1    0.0  0.0422
## gradeAtHire:ageAtHire:leaveYear                      4    1.2  0.2915
## department:ageAtHire:leaveYear                       1    0.0  0.0240
## gender:ageAtHire:leaveYear                           1    0.0  0.0001
## gradeAtHire:department:gender:ageAtHire              3    0.1  0.0361
## gradeAtHire:department:gender:leaveYear              3    0.1  0.0284
## gradeAtHire:department:ageAtHire:leaveYear           3    0.4  0.1229
## gradeAtHire:gender:ageAtHire:leaveYear               4    0.1  0.0154
## department:gender:ageAtHire:leaveYear                1    0.0  0.0111
## gradeAtHire:department:gender:ageAtHire:leaveYear    3    0.1  0.0291
## Residuals                                         3880  398.4  0.1027
##                                                   F value  Pr(>F)    
## gradeAtHire                                         0.843 0.49759    
## department                                          8.716 0.00317 ** 
## gender                                             22.762 1.9e-06 ***
## ageAtHire                                           0.354 0.55207    
## leaveYear                                           0.004 0.95055    
## gradeAtHire:department                              0.419 0.73930    
## gradeAtHire:gender                                  0.587 0.67214    
## department:gender                                   0.105 0.74548    
## gradeAtHire:ageAtHire                               1.947 0.09981 .  
## department:ageAtHire                                1.504 0.22017    
## gender:ageAtHire                                    0.167 0.68243    
## gradeAtHire:leaveYear                               0.134 0.96972    
## department:leaveYear                                0.518 0.47161    
## gender:leaveYear                                    0.255 0.61329    
## ageAtHire:leaveYear                                 0.143 0.70561    
## gradeAtHire:department:gender                       0.928 0.42648    
## gradeAtHire:department:ageAtHire                    1.041 0.37323    
## gradeAtHire:gender:ageAtHire                        0.304 0.87517    
## department:gender:ageAtHire                         1.017 0.31328    
## gradeAtHire:department:leaveYear                    0.056 0.98279    
## gradeAtHire:gender:leaveYear                        4.042 0.00284 ** 
## department:gender:leaveYear                         0.411 0.52149    
## gradeAtHire:ageAtHire:leaveYear                     2.839 0.02296 *  
## department:ageAtHire:leaveYear                      0.234 0.62889    
## gender:ageAtHire:leaveYear                          0.001 0.97748    
## gradeAtHire:department:gender:ageAtHire             0.351 0.78835    
## gradeAtHire:department:gender:leaveYear             0.277 0.84221    
## gradeAtHire:department:ageAtHire:leaveYear          1.197 0.30932    
## gradeAtHire:gender:ageAtHire:leaveYear              0.150 0.96304    
## department:gender:ageAtHire:leaveYear               0.108 0.74276    
## gradeAtHire:department:gender:ageAtHire:leaveYear   0.284 0.83718    
## Residuals                                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Main effects of department, gender, and hireYear, but 3-way interaction of GradeAtHire, gender and HireYear

And visualised the three-way interaction I observed in the analysis.

data_term_leave <- data_left %>%
  dplyr::group_by(leaveYear, department, gender) %>%
  dplyr::summarise(leave_left = n(),
                   leave_terminated = sum(term),
                   leave_prop_term = sum(term) / n(),
                   m_ageAtHire = mean(ageAtHire),
                   sd_ageAtHire = sd(ageAtHire, na.rm = T),
                   m_ageWhenLeft = mean(age),
                   sd_ageWhenLeft = sd(age),
                   m_tenure = mean(tenure),
                   sd_tenure = sd(tenure),
                   m_gradeAtHire = mean(as.numeric(gradeAtHire)),
                   sd_gradeAtHire = sd(as.numeric(gradeAtHire))) %>%
  dplyr::select("year" = leaveYear, department, gender, leave_left, leave_terminated, leave_prop_term)


r1 <- ggplot(data_term_leave, aes(x = year, y = leave_prop_term, colour = gender))+
  geom_line(alpha = 0.5)+
  geom_smooth(se = FALSE)+
  facet_wrap(~department)+
  labs(title = "Of those who left, the proportion terminated",
       x = "Year left",
       y = "Proportion terminated",
       colour = "Gender",
       subtitle = "By department, gender and year they left")+
  BIT_theme+
  scale_colour_manual(values = strong_palette)

data_term_hire <- data_left %>%
  dplyr::group_by(hireYear, department, gender) %>%
  dplyr::summarise(hire_left = n(),
                   hire_terminated = sum(term),
                   hire_prop_term = sum(term) / n(),
                   m_ageAtHire = mean(ageAtHire),
                   sd_ageAtHire = sd(ageAtHire, na.rm = T),
                   m_ageWhenLeft = mean(age),
                   sd_ageWhenLeft = sd(age),
                   m_tenure = mean(tenure),
                   sd_tenure = sd(tenure),
                   m_gradeAtHire = mean(as.numeric(gradeAtHire)),
                   sd_gradeAtHire = sd(as.numeric(gradeAtHire))) %>%
  select("year" = hireYear, department, gender, hire_left, hire_terminated, hire_prop_term)


r2 <- ggplot(data_term_hire, aes(x = year, y = hire_prop_term, colour = gender))+
  geom_line(alpha = 0.5)+
  geom_smooth(se = FALSE)+
  facet_wrap(~department)+
  labs(title = "Of those who left, the proportion terminated",
       x = "Year hired",
       y = "Proportion terminated",
     subtitle = "By department, gender and year they were hired")+
  BIT_theme+
  scale_colour_manual(values = strong_palette)

grid.arrange(r1, r2, ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

data_rate <- data_timeline %>%
  dplyr::group_by(year, department, gender) %>%
  dplyr::summarise(workforce = n(),
                   left = sum(current != TRUE),
                   joined = sum(new == TRUE),
                   terminated = sum(term, na.rm = T)) %>%
  dplyr::mutate(joining_rate = joined / workforce,
                leaving_rate = left / workforce,
                termination_rate = terminated / workforce,
                term_leaving_rate = terminated / left) %>%
  dplyr::select(year, department, gender, 
                "Joining rate" = joining_rate, 
                "Leaving rate" = leaving_rate, 
                "Termination rate - workforce" = termination_rate, 
                "Terminations - within leavers" = term_leaving_rate) %>%
  tidyr::gather("rate", "value", -year, -department, -gender)

ggplot(data_rate, aes(x = year, y = value, colour = gender))+
  geom_line(alpha = 0.5)+
  geom_smooth(se = FALSE)+
  facet_wrap(department~rate, ncol = 4, scales = "free")+
  labs(title = "Rates of joining, leaving and termination in the workforce",
       x = "Year",
       y = "Rate",
       colour = "Gender",
       subtitle = "As a proportion of the male or female workforce respectively, or of leavers only")+
  BIT_theme+
  scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'